# Merge a corpus that consists entirely of .txt files into a single file.

import glob  # glob是python自己带的一个文件操作相关模块，用它可以查找符合自己目的的文件，类似于Windows下的文件搜索
import os

# Directory whose *.txt files are merged when this module runs as a script.
path = "/data/hj_workspace/tgt_wiki"
# Output file that the merged, tab-separated lines are appended to.
f = "/data/corpus_hj/synonymous_80000.txt"


# Read every .txt file under `path` and append its two lines, tab-joined, to the output file.
def read_write_file(path, f, encoding='utf-8'):
    """Merge every two-line ``*.txt`` file directly under *path* into *f*.

    Each matching file that contains exactly two lines contributes one line
    to the output: both lines stripped of surrounding whitespace and joined
    by a tab. Files with any other line count are reported on stdout and
    skipped. A progress message is printed every 1000 files.

    Args:
        path: Directory to scan; the glob pattern is ``path + '/*.txt'``.
        f: Output file path. It is opened in append mode, so existing
            content is kept and repeated runs add to it.
        encoding: Text encoding used for reading the inputs and writing the
            output. Defaults to UTF-8 rather than the locale's encoding.

    Raises:
        OSError: If the output file or an input file cannot be opened.
        UnicodeDecodeError: If an input file is not valid in *encoding*.
    """
    # Sorted so the merged output does not depend on directory listing order.
    sources = sorted(glob.glob(path + '/*.txt'))
    # Resolved once so the output file is recognised if it lives inside *path*.
    target = os.path.realpath(f)

    # The context manager flushes and closes the output even when reading an
    # input file raises part-way through the run.
    with open(f, 'a', encoding=encoding) as merged:
        for count, source in enumerate(sources, 1):
            if count % 1000 == 0:
                print("执行了 %d 个文件" % count)
            # Never feed the output file back into itself.
            if os.path.realpath(source) == target:
                continue
            with open(source, 'r', encoding=encoding) as handle:
                lines = handle.readlines()
            if len(lines) == 2:
                merged.write(lines[0].strip() + "\t" + lines[1].strip() + "\n")
            else:
                print("这个文件不是2行，过滤掉", source)


# Script entry point: merge the configured corpus directory into the output file.
if __name__ == "__main__":
    read_write_file(path=path, f=f)
